In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#To read CSV file
df = pd.read_csv("C:/Users/HARISHKUMAR/Downloads/breast-cancer.csv")
df
Out[1]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 ... 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 ... 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 NaN
565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 ... 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 NaN
566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 ... 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 NaN
567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 ... 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 NaN
568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 ... 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 NaN

569 rows × 33 columns

In [2]:
df.shape
Out[2]:
(569, 33)
In [3]:
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0})
In [4]:
df.head()
Out[4]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN

5 rows × 33 columns

In [5]:
df.tail()
Out[5]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
564 926424 1 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 ... 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115 NaN
565 926682 1 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 ... 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637 NaN
566 926954 1 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 ... 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820 NaN
567 927241 1 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 ... 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400 NaN
568 92751 0 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 ... 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039 NaN

5 rows × 33 columns

In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    int64  
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
 32  Unnamed: 32              0 non-null      float64
dtypes: float64(31), int64(2)
memory usage: 146.8 KB
In [7]:
df.isnull().sum()
Out[7]:
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed: 32                569
dtype: int64
In [8]:
df.nunique()
Out[8]:
id                         569
diagnosis                    2
radius_mean                456
texture_mean               479
perimeter_mean             522
area_mean                  539
smoothness_mean            474
compactness_mean           537
concavity_mean             537
concave points_mean        542
symmetry_mean              432
fractal_dimension_mean     499
radius_se                  540
texture_se                 519
perimeter_se               533
area_se                    528
smoothness_se              547
compactness_se             541
concavity_se               533
concave points_se          507
symmetry_se                498
fractal_dimension_se       545
radius_worst               457
texture_worst              511
perimeter_worst            514
area_worst                 544
smoothness_worst           411
compactness_worst          529
concavity_worst            539
concave points_worst       492
symmetry_worst             500
fractal_dimension_worst    535
Unnamed: 32                  0
dtype: int64
In [9]:
df.dtypes
Out[9]:
id                           int64
diagnosis                    int64
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
Unnamed: 32                float64
dtype: object
In [10]:
df.describe().T
Out[10]:
count mean std min 25% 50% 75% max
id 569.0 3.037183e+07 1.250206e+08 8670.000000 869218.000000 906024.000000 8.813129e+06 9.113205e+08
diagnosis 569.0 3.725835e-01 4.839180e-01 0.000000 0.000000 0.000000 1.000000e+00 1.000000e+00
radius_mean 569.0 1.412729e+01 3.524049e+00 6.981000 11.700000 13.370000 1.578000e+01 2.811000e+01
texture_mean 569.0 1.928965e+01 4.301036e+00 9.710000 16.170000 18.840000 2.180000e+01 3.928000e+01
perimeter_mean 569.0 9.196903e+01 2.429898e+01 43.790000 75.170000 86.240000 1.041000e+02 1.885000e+02
area_mean 569.0 6.548891e+02 3.519141e+02 143.500000 420.300000 551.100000 7.827000e+02 2.501000e+03
smoothness_mean 569.0 9.636028e-02 1.406413e-02 0.052630 0.086370 0.095870 1.053000e-01 1.634000e-01
compactness_mean 569.0 1.043410e-01 5.281276e-02 0.019380 0.064920 0.092630 1.304000e-01 3.454000e-01
concavity_mean 569.0 8.879932e-02 7.971981e-02 0.000000 0.029560 0.061540 1.307000e-01 4.268000e-01
concave points_mean 569.0 4.891915e-02 3.880284e-02 0.000000 0.020310 0.033500 7.400000e-02 2.012000e-01
symmetry_mean 569.0 1.811619e-01 2.741428e-02 0.106000 0.161900 0.179200 1.957000e-01 3.040000e-01
fractal_dimension_mean 569.0 6.279761e-02 7.060363e-03 0.049960 0.057700 0.061540 6.612000e-02 9.744000e-02
radius_se 569.0 4.051721e-01 2.773127e-01 0.111500 0.232400 0.324200 4.789000e-01 2.873000e+00
texture_se 569.0 1.216853e+00 5.516484e-01 0.360200 0.833900 1.108000 1.474000e+00 4.885000e+00
perimeter_se 569.0 2.866059e+00 2.021855e+00 0.757000 1.606000 2.287000 3.357000e+00 2.198000e+01
area_se 569.0 4.033708e+01 4.549101e+01 6.802000 17.850000 24.530000 4.519000e+01 5.422000e+02
smoothness_se 569.0 7.040979e-03 3.002518e-03 0.001713 0.005169 0.006380 8.146000e-03 3.113000e-02
compactness_se 569.0 2.547814e-02 1.790818e-02 0.002252 0.013080 0.020450 3.245000e-02 1.354000e-01
concavity_se 569.0 3.189372e-02 3.018606e-02 0.000000 0.015090 0.025890 4.205000e-02 3.960000e-01
concave points_se 569.0 1.179614e-02 6.170285e-03 0.000000 0.007638 0.010930 1.471000e-02 5.279000e-02
symmetry_se 569.0 2.054230e-02 8.266372e-03 0.007882 0.015160 0.018730 2.348000e-02 7.895000e-02
fractal_dimension_se 569.0 3.794904e-03 2.646071e-03 0.000895 0.002248 0.003187 4.558000e-03 2.984000e-02
radius_worst 569.0 1.626919e+01 4.833242e+00 7.930000 13.010000 14.970000 1.879000e+01 3.604000e+01
texture_worst 569.0 2.567722e+01 6.146258e+00 12.020000 21.080000 25.410000 2.972000e+01 4.954000e+01
perimeter_worst 569.0 1.072612e+02 3.360254e+01 50.410000 84.110000 97.660000 1.254000e+02 2.512000e+02
area_worst 569.0 8.805831e+02 5.693570e+02 185.200000 515.300000 686.500000 1.084000e+03 4.254000e+03
smoothness_worst 569.0 1.323686e-01 2.283243e-02 0.071170 0.116600 0.131300 1.460000e-01 2.226000e-01
compactness_worst 569.0 2.542650e-01 1.573365e-01 0.027290 0.147200 0.211900 3.391000e-01 1.058000e+00
concavity_worst 569.0 2.721885e-01 2.086243e-01 0.000000 0.114500 0.226700 3.829000e-01 1.252000e+00
concave points_worst 569.0 1.146062e-01 6.573234e-02 0.000000 0.064930 0.099930 1.614000e-01 2.910000e-01
symmetry_worst 569.0 2.900756e-01 6.186747e-02 0.156500 0.250400 0.282200 3.179000e-01 6.638000e-01
fractal_dimension_worst 569.0 8.394582e-02 1.806127e-02 0.055040 0.071460 0.080040 9.208000e-02 2.075000e-01
Unnamed: 32 0.0 NaN NaN NaN NaN NaN NaN NaN
In [11]:
sns.countplot(x='diagnosis', data=df)
plt.title('Distribution of Diagnosis')
plt.show()
In [12]:
numerical_columns = df.select_dtypes(exclude=['object']).columns
for col in numerical_columns:
    sns.histplot(df[col], kde=True)
    plt.title(f'Univariate Analysis of {col}')
    plt.show()
In [13]:
sns.pairplot(df, hue = 'diagnosis', vars = ['radius_mean', 'texture_mean', 'area_mean', 'perimeter_mean', 'smoothness_mean'] )
Out[13]:
<seaborn.axisgrid.PairGrid at 0x26c9c308880>
In [14]:
sns.set_style("darkgrid")
 
# Identify numerical columns
numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns
 
# Plot distribution of each numerical feature
plt.figure(figsize=(14, len(numerical_columns) * 3))
for idx, feature in enumerate(numerical_columns, 1):
    plt.subplot(len(numerical_columns), 2, idx)
    sns.histplot(df[feature], kde=True)
    plt.title(f"{feature} | Skewness: {round(df[feature].skew(), 2)}")
plt.tight_layout()
plt.show()
In [15]:
sns.scatterplot(x = 'area_mean', y = 'smoothness_mean', hue = 'diagnosis', data=df)
Out[15]:
<Axes: xlabel='area_mean', ylabel='smoothness_mean'>
In [16]:
sns.scatterplot(x='concavity_se',y= 'radius_mean', hue ='diagnosis', data = df)
Out[16]:
<Axes: xlabel='concavity_se', ylabel='radius_mean'>
In [17]:
sns.scatterplot(x='compactness_se', y='radius_mean', hue ='diagnosis', data = df)
Out[17]:
<Axes: xlabel='compactness_se', ylabel='radius_mean'>
In [18]:
sns.lmplot(x='area_mean', y='smoothness_mean', data=df, hue='diagnosis', fit_reg=False)
Out[18]:
<seaborn.axisgrid.FacetGrid at 0x26ca85e11e0>
In [19]:
sns.catplot(x='texture_mean',y='area_mean',hue='diagnosis',data=df)
Out[19]:
<seaborn.axisgrid.FacetGrid at 0x26ca85e2560>
In [20]:
plt.figure(figsize=(24,12)) 
sns.heatmap(df.corr(), annot=True,fmt='.2f', linewidths=2)
 
plt.title('Correlation Heatmap')
plt.show()
In [21]:
numerical_cols = ['radius_mean', 'texture_mean', 'area_mean', 'perimeter_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']
from sklearn.preprocessing import StandardScaler
# Scale the numerical columns
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
In [22]:
df.head()
Out[22]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 1 1.097064 -2.073335 1.269934 0.984375 1.568466 3.283515 2.652874 2.532475 ... -1.359293 2.303601 2.001237 1.307686 2.616665 2.109526 2.296076 2.750622 1.937015 NaN
1 842517 1 1.829821 -0.353632 1.685955 1.908708 -0.826962 -0.487072 -0.023846 0.548144 ... -0.369203 1.535126 1.890489 -0.375612 -0.430444 -0.146749 1.087084 -0.243890 0.281190 NaN
2 84300903 1 1.579888 0.456187 1.566503 1.558884 0.942210 1.052926 1.363478 2.037231 ... -0.023974 1.347475 1.456285 0.527407 1.082932 0.854974 1.955000 1.152255 0.201391 NaN
3 84348301 1 -0.768909 0.253732 -0.592687 -0.764464 3.283553 3.402909 1.915897 1.451707 ... 0.133984 -0.249939 -0.550021 3.394275 3.893397 1.989588 2.175786 6.046041 4.935010 NaN
4 84358402 1 1.750297 -1.151816 1.776573 1.826229 0.280372 0.539340 1.371011 1.428493 ... -1.466770 1.338539 1.220724 0.220556 -0.313395 0.613179 0.729259 -0.868353 -0.397100 NaN

5 rows × 33 columns

In [23]:
unwantedcolumnlist=["diagnosis","Unnamed: 32","id"]
In [24]:
X = df.drop(unwantedcolumnlist,axis=1)
In [25]:
y = df['diagnosis']
In [26]:
simple_random_sample = df.sample(n=60, random_state=42)
print("Simple Random Sampling:")
print(simple_random_sample.head())
Simple Random Sampling:
         id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
204   87930          0    -0.470694     -0.160486       -0.448110  -0.491999   
70   859575          1     1.366877      0.470149        1.302886   1.351264   
131    8670          1     0.378508      0.044296        0.400820   0.267377   
431  907915          0    -0.490575     -0.374576       -0.432457  -0.532101   
540  921385          0    -0.734828     -1.128546       -0.713374  -0.716683   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
204         0.234114          0.027651       -0.109847            -0.276232   
70         -0.446227         -0.027309        0.241064             0.789060   
131         0.913744          0.340350        0.725686             0.824140   
431         0.643316          0.516599       -0.142993            -0.539846   
540         0.247636          0.145150       -0.269044            -0.592724   

     ...  texture_worst  perimeter_worst  area_worst  smoothness_worst  \
204  ...      -0.168905        -0.333935   -0.356299          0.448503   
70   ...       0.147012         1.746605    1.732277         -0.572873   
131  ...       0.052562         0.525386    0.484159          0.974533   
431  ...      -0.450625        -0.525756   -0.641257          0.553709   
540  ...      -0.976611        -0.848337   -0.743216          0.093432   

     compactness_worst  concavity_worst  concave points_worst  symmetry_worst  \
204          -0.104741        -0.024412             -0.199563        0.183204   
70           -0.131459        -0.016736              0.978975       -0.565828   
131          -0.094562         0.512911              0.560244       -0.103143   
431           0.054930        -0.152986             -0.622863       -0.557739   
540          -0.270137        -0.443716             -0.691687       -0.924975   

     fractal_dimension_worst  Unnamed: 32  
204                 0.196958          NaN  
70                 -1.000578          NaN  
131                -0.208132          NaN  
431                 0.534440          NaN  
540                -0.144403          NaN  

[5 rows x 33 columns]
In [27]:
strata = df['diagnosis'].unique()
stratified_sample = df.groupby('diagnosis').apply(lambda x: x.sample(n=50, random_state=42))
print("Stratified Sampling:")
print(stratified_sample.head())
Stratified Sampling:
                   id  diagnosis  radius_mean  texture_mean  perimeter_mean  \
diagnosis                                                                     
0         395  903811          0    -0.019112     -0.490929       -0.091402   
          110  864033          0    -1.235545     -0.535144       -1.213835   
          481   91227          0    -0.064554     -0.011554       -0.133416   
          493  914101          0    -0.473535     -1.503204       -0.541199   
          136  868223          0    -0.686545     -0.609610       -0.710491   

               area_mean  smoothness_mean  compactness_mean  concavity_mean  \
diagnosis                                                                     
0         395  -0.130229        -1.132262         -0.961427       -0.778274   
          110  -1.037213         0.522334         -0.384734       -0.570740   
          481  -0.147862        -1.170692         -0.968060       -0.738851   
          493  -0.505082        -1.611206         -1.211208       -1.024816   
          136  -0.657810         0.621966         -0.822323       -0.663898   

               concave points_mean  ...  texture_worst  perimeter_worst  \
diagnosis                           ...                                   
0         395            -0.423257  ...      -0.054915        -0.322915   
          110            -0.803203  ...      -0.685120        -1.059816   
          481            -0.727884  ...       0.120957        -0.085224   
          493            -0.965447  ...      -1.517252        -0.715492   
          136            -0.591176  ...      -0.032117        -0.628517   

               area_worst  smoothness_worst  compactness_worst  \
diagnosis                                                        
0         395   -0.344697         -1.129589          -0.834393   
          110   -0.902834          0.628230          -0.494694   
          481   -0.088042         -1.138356          -0.717343   
          493   -0.609263         -1.664826          -1.205453   
          136   -0.586937         -0.230954          -0.963529   

               concavity_worst  concave points_worst  symmetry_worst  \
diagnosis                                                              
0         395        -0.899960             -0.540487       -0.611126   
          110        -0.682153             -0.932876       -0.594948   
          481        -0.503205             -0.504095       -0.881295   
          493        -1.225520             -1.336990       -1.004247   
          136        -0.804010             -0.684074       -1.923146   

               fractal_dimension_worst  Unnamed: 32  
diagnosis                                            
0         395                -0.989495          NaN  
          110                 0.040685          NaN  
          481                -0.438661          NaN  
          493                -0.757302          NaN  
          136                -0.582743          NaN  

[5 rows x 33 columns]
In [28]:
systematic_sample = df.iloc[::10, :]
print("Systematic Sampling:")
print(systematic_sample.head())
Systematic Sampling:
         id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302          1     1.097064     -2.073335        1.269934   0.984375   
10   845636          1     0.537556      0.919273        0.442011   0.406453   
20  8510653          0    -0.297446     -0.833008       -0.261106  -0.383638   
30   853401          1     1.278833      1.354435        1.352314   1.231812   
40   855167          1    -0.195201      0.532980       -0.238451  -0.261342   

    smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          1.568466          3.283515        2.652874             2.532475   
10        -1.017686         -0.713542       -0.700684            -0.404686   
20         0.792763          0.429422       -0.541362            -0.459627   
30         0.714481          1.598728        1.796625             1.946952   
40        -1.048999         -0.834452       -0.724413            -0.737944   

    ...  texture_worst  perimeter_worst  area_worst  smoothness_worst  \
0   ...      -1.359293         2.303601    2.001237          1.307686   
10  ...       1.335771         0.492622    0.473611         -0.625477   
20  ...      -0.844707        -0.332744   -0.439624         -0.051226   
30  ...       1.356941         1.585762    1.387726          0.733436   
40  ...       0.744648        -0.141817   -0.162929         -1.006849   

    compactness_worst  concavity_worst  concave points_worst  symmetry_worst  \
0            2.616665         2.109526              2.296076        2.750622   
10          -0.630828        -0.605872             -0.226210        0.076431   
20           0.148443        -0.399099             -0.636110        0.458227   
30           1.090566         1.636491              1.068812        0.878850   
40          -0.317847        -0.305547             -0.051865        0.150849   

    fractal_dimension_worst  Unnamed: 32  
0                  1.937015          NaN  
10                 0.031819          NaN  
20                -0.117250          NaN  
30                 0.768849          NaN  
40                -0.691912          NaN  

[5 rows x 33 columns]
In [29]:
df['cluster'] = np.random.choice([0, 1], size=len(df))
cluster_sample = df.groupby('cluster').apply(lambda x: x.sample(frac=0.2, random_state=42))
print("Cluster Sampling:")
print(cluster_sample.head())
Cluster Sampling:
                  id  diagnosis  radius_mean  texture_mean  perimeter_mean  \
cluster                                                                      
0       78   8610862          1     1.719055      1.089149        2.130809   
        254   886226          1     1.511725      0.009390        1.422337   
        404   904969          0    -0.507616     -1.009865       -0.563442   
        282    89122          1     1.497524     -0.258223        1.451171   
        523   917896          0    -0.118517     -0.141869       -0.133416   

             area_mean  smoothness_mean  compactness_mean  concavity_mean  \
cluster                                                                     
0       78    1.678336         2.294354          4.568425        3.598263   
        254   1.462184         0.508101          0.274020        0.616458   
        404  -0.528403        -0.678938         -1.111144       -0.850089   
        282   1.393926         0.522334          0.755387        0.926565   
        523  -0.238589         0.199243          0.050392       -0.438788   

             concave points_mean  ...  perimeter_worst  area_worst  \
cluster                           ...                                
0       78              2.875535  ...         1.877663    1.305104   
        254             0.954141  ...         1.663205    1.918616   
        404            -0.732011  ...        -0.689578   -0.610845   
        282             1.179323  ...         1.344497    1.313894   
        523            -0.286033  ...        -0.233259   -0.314109   

             smoothness_worst  compactness_worst  concavity_worst  \
cluster                                                             
0       78           1.382207           2.303684         2.379147   
        254          0.759738           0.393357         0.765260   
        404         -1.208494          -1.188468        -1.069745   
        282          0.851793           0.767407         0.764781   
        523          0.444119           0.014854        -0.377510   

             concave points_worst  symmetry_worst  fractal_dimension_worst  \
cluster                                                                      
0       78               2.073768        4.107940                 0.869706   
        254              1.298734        0.773694                 0.307790   
        404             -1.015252       -0.975127                -1.341385   
        282              1.683967        1.115046                -0.336696   
        523              0.210032       -0.083729                 0.352676   

             Unnamed: 32  cluster  
cluster                            
0       78           NaN        0  
        254          NaN        0  
        404          NaN        0  
        282          NaN        0  
        523          NaN        0  

[5 rows x 34 columns]
In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
In [31]:
summary_stats = df.describe()
In [32]:
correlation_matrix = df.corr()
In [33]:
sns.countplot(x='diagnosis', data=df)
plt.title('Distribution of Diagnosis (0: B, 1: M)')
plt.show()
In [34]:
df = df.dropna(axis=1)
print("DataFrame after dropping columns with missing data:")
print(df)
DataFrame after dropping columns with missing data:
           id  diagnosis  radius_mean  texture_mean  perimeter_mean  \
0      842302          1     1.097064     -2.073335        1.269934   
1      842517          1     1.829821     -0.353632        1.685955   
2    84300903          1     1.579888      0.456187        1.566503   
3    84348301          1    -0.768909      0.253732       -0.592687   
4    84358402          1     1.750297     -1.151816        1.776573   
..        ...        ...          ...           ...             ...   
564    926424          1     2.110995      0.721473        2.060786   
565    926682          1     1.704854      2.085134        1.615931   
566    926954          1     0.702284      2.045574        0.672676   
567    927241          1     1.838341      2.336457        1.982524   
568     92751          0    -1.808401      1.221792       -1.814389   

     area_mean  smoothness_mean  compactness_mean  concavity_mean  \
0     0.984375         1.568466          3.283515        2.652874   
1     1.908708        -0.826962         -0.487072       -0.023846   
2     1.558884         0.942210          1.052926        1.363478   
3    -0.764464         3.283553          3.402909        1.915897   
4     1.826229         0.280372          0.539340        1.371011   
..         ...              ...               ...             ...   
564   2.343856         1.041842          0.219060        1.947285   
565   1.723842         0.102458         -0.017833        0.693043   
566   0.577953        -0.840484         -0.038680        0.046588   
567   1.735218         1.525767          3.272144        3.296944   
568  -1.347789        -3.112085         -1.150752       -1.114873   

     concave points_mean  ...  texture_worst  perimeter_worst  area_worst  \
0               2.532475  ...      -1.359293         2.303601    2.001237   
1               0.548144  ...      -0.369203         1.535126    1.890489   
2               2.037231  ...      -0.023974         1.347475    1.456285   
3               1.451707  ...       0.133984        -0.249939   -0.550021   
4               1.428493  ...      -1.466770         1.338539    1.220724   
..                   ...  ...            ...              ...         ...   
564             2.320965  ...       0.117700         1.752563    2.015301   
565             1.263669  ...       2.047399         1.421940    1.494959   
566             0.105777  ...       1.374854         0.579001    0.427906   
567             2.658866  ...       2.237926         2.303601    1.653171   
568            -1.261820  ...       0.764190        -1.432735   -1.075813   

     smoothness_worst  compactness_worst  concavity_worst  \
0            1.307686           2.616665         2.109526   
1           -0.375612          -0.430444        -0.146749   
2            0.527407           1.082932         0.854974   
3            3.394275           3.893397         1.989588   
4            0.220556          -0.313395         0.613179   
..                ...                ...              ...   
564          0.378365          -0.273318         0.664512   
565         -0.691230          -0.394820         0.236573   
566         -0.809587           0.350735         0.326767   
567          1.430427           3.904848         3.197605   
568         -1.859019          -1.207552        -1.305831   

     concave points_worst  symmetry_worst  fractal_dimension_worst  cluster  
0                2.296076        2.750622                 1.937015        0  
1                1.087084       -0.243890                 0.281190        1  
2                1.955000        1.152255                 0.201391        1  
3                2.175786        6.046041                 4.935010        1  
4                0.729259       -0.868353                -0.397100        1  
..                    ...             ...                      ...      ...  
564              1.629151       -1.360158                -0.709091        1  
565              0.733827       -0.531855                -0.973978        0  
566              0.414069       -1.104549                -0.318409        0  
567              2.289985        1.919083                 2.219635        0  
568             -1.745063       -0.048138                -0.751207        0  

[569 rows x 33 columns]
In [192]:
unwantedcolumnlist=["diagnosis","id"]
In [193]:
x= df.drop(unwantedcolumnlist,axis=1)
In [196]:
y = df['diagnosis']
In [197]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
In [198]:
model = LogisticRegression(random_state=42)
model.fit(x_train, y_train)
Out[198]:
LogisticRegression(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(random_state=42)
In [199]:
y_pred = model.predict(x_test)
In [200]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.9736842105263158
In [201]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:','\n',cm)
Confusion Matrix: 
 [[70  1]
 [ 2 41]]
In [202]:
sns.heatmap(cm, annot=True)
Out[202]:
<Axes: >
In [203]:
print('Classification Report:','\n',classification_report(y_test, y_pred))
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

In [204]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
ros = RandomOverSampler(random_state=42)
x_res, y_res = ros.fit_resample(x, y)

rus = RandomUnderSampler(random_state=42)
x_res, y_res = rus.fit_resample(x, y)
In [205]:
x_train, x_test, y_train, y_test = train_test_split(x_res, y_res, test_size=0.2, random_state=42)
In [206]:
model = LogisticRegression(random_state=42)
model.fit(x_train, y_train)
Out[206]:
LogisticRegression(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(random_state=42)
In [207]:
y_pred = model.predict(x_test)
In [208]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.9764705882352941
In [209]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:','\n',cm)
Confusion Matrix: 
 [[44  2]
 [ 0 39]]
In [210]:
print('Classification Report:','\n',classification_report(y_test, y_pred))
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98        46
           1       0.95      1.00      0.97        39

    accuracy                           0.98        85
   macro avg       0.98      0.98      0.98        85
weighted avg       0.98      0.98      0.98        85

This Classification Report suggests that the model has performed very well, with high precision, recall, and F1-scores for both classes, as well as a high overall accuracy of 0.98.

In [ ]: